home *** CD-ROM | disk | FTP | other *** search
- /* WIDE AREA INFORMATION SERVER SOFTWARE
- No guarantees or restrictions. See the readme file for the full standard
- disclaimer.
- Brewster@think.com
- */
-
-
- /* implements the search part of irext.h
- (search_word and finished_search_word)
- -brewster
-
- Split from irsearch.c
-
- 5/31/91 Added scale_scores. Fixed document_score_array to long.
- 7/8/91 Removed scale_scores, handled in search_word with doc_id > 0.
-
- - Jonny G
- */
-
- #include "cdialect.h"
- #include "irfiles.h"
- #include "irsearch.h"
- #include "irext.h"
- #include <string.h>
-
/*============================*
 *=== Setting Parameters ===*
 *============================*/
-
- long max_hit_retrieved = 0;
-
- long set_query_parameter (mask, parameters)
- long mask;
- query_parameter_type * parameters;
- {
- switch (mask)
- {
- case SET_MAX_RETRIEVED_MASK:
- max_hit_retrieved = parameters->max_hit_retrieved;
- break;
- default:
- break;
- }
- }
-
- /*==============================*
- *=== Document Score Array ===*
- *==============================*/
-
- long *document_score_array = NULL;
- long document_score_array_len = 0;
-
- /* make_document_score_array insures that the document_score_array
- array is long enough, if not it makes it long enough */
- static void make_document_score_array _AP((long length));
- static void make_document_score_array(length)
- long length;
- {
- if(length <= document_score_array_len)
- return;
- /* we have to make a new one. free the old one first (if any) */
- if(document_score_array != 0){
- s_free(document_score_array);
- }
- document_score_array = (long*)s_malloc(
- (size_t)(length * sizeof(long)));
- document_score_array_len = length;
- }
-
- static void destroy_document_score_array _AP((void));
- static void destroy_document_score_array()
- {
- s_free(document_score_array);
- document_score_array_len = 0;
- }
-
- void clear_document_score_array()
- /* side effects the document_score_array. XXX could use memset */
- {
- memset(document_score_array, 0,
- document_score_array_len * sizeof(long));
- }
-
- /* for debugging purposes */
- void print_document_score_array(start,stop)
- unsigned long start;
- unsigned long stop;
- /* assumes start >= 0, stop < db->doc_table_allocated_entries */
- {
- long i;
- for(i = start; i <= stop; i++){
- printf("entry number %d: %d \n",
- i, (unsigned char)document_score_array[i]);
- }
- }
-
-
-
- /*=========================*
- *=== Best Hits Array ===*
- *=========================*/
-
- hit *best_hits_array = NULL;
- long best_hits_array_len = 0;
- long current_best_hit = 0;
-
- /* make_best_hits_array insures that the best_hits_array
- array is long enough, if not it makes it long enough */
- static void make_best_hits_array _AP((long length));
- static void make_best_hits_array(length)
- long length;
- {
- if(length <= best_hits_array_len)
- return;
- /* we have to make a new one. free the old one first (if any) */
- if(best_hits_array != 0){
- s_free(best_hits_array);
- }
- best_hits_array = (hit*)s_malloc((size_t)(length * sizeof(hit)));
- best_hits_array_len = length;
- }
-
- static void destroy_best_hits_array _AP((void));
- static void destroy_best_hits_array()
- {
- s_free(best_hits_array);
- best_hits_array_len = 0;
- }
-
- void clear_best_hits_array()
- /* side effects the best_hits_array. XXX could use memset */
- {
- memset((char*)best_hits_array, 0, best_hits_array_len * sizeof(hit));
- }
-
- /* for debugging purposes */
- void print_best_hits()
- {
- long i;
- for( i = 0; i < best_hits_array_len; i++){
- if (best_hits_array[i].weight != 0)
- { printf("Best hit %ld: weight %ld, doc_id %ld, headline %s, filename %s, lines %ld\n",
- i, best_hits_array[i].weight,
- best_hits_array[i].document_id,
- best_hits_array[i].headline,
- best_hits_array[i].filename,
- best_hits_array[i].number_of_lines);
- }
- }
- }
-
- void sort_best_hits(db)
- database * db;
- {
- /* returns nothing.
- * side effects best_hits and document_score_array
- */
-
- long i, doc;
- long worst_weight_to_make_it = 0;
- document_table_entry doc_entry;
- long best_hit_number = 0;
-
- /* snuff the scores */
- for(i = 0; i < max_hit_retrieved; i++){
- best_hits_array[i].weight = 0;
- }
-
- /* loop over the doc, and keep the doc_id and weight in best hit table */
- for(doc = 0; doc < db->doc_table_allocated_entries; doc++){
- long weight = document_score_array[doc];
- if(worst_weight_to_make_it < weight){
- /* merge it into the best_hits array. start at the bottom */
- for(i = (max_hit_retrieved - 1); i >= 0; i--){
- if(weight > best_hits_array[i].weight
- /* && (check_document_id(doc, db) == true) too slow.*/
- ){
- /* move this entry down */
- if((i + 1) < max_hit_retrieved){
- best_hits_array[i+1].weight = best_hits_array[i].weight;
- best_hits_array[i+1].document_id = best_hits_array[i].document_id;
- }
- best_hits_array[i].document_id = doc;
- best_hits_array[i].weight = weight;
- }
- else
- break;
- }
- }
- }
-
- for(i = 0; i < max_hit_retrieved; i++){
- if(best_hits_array[i].weight <= 0) /* if we are out of good stuff, return */
- return;
- /* fill in the rest of the hit */
- if (read_document_table_entry(&doc_entry,
- best_hits_array[i].document_id,
- db)
- == true){
- best_hits_array[best_hit_number].weight = best_hits_array[i].weight;
- best_hits_array[best_hit_number].document_id = best_hits_array[i].document_id;
- best_hits_array[best_hit_number].start_character = doc_entry.start_character;
- best_hits_array[best_hit_number].end_character = doc_entry.end_character;
- best_hits_array[best_hit_number].document_length = doc_entry.document_length;
- best_hits_array[best_hit_number].number_of_lines = doc_entry.number_of_lines;
- read_filename_table_entry(doc_entry.filename_id,
- best_hits_array[best_hit_number].filename,
- best_hits_array[best_hit_number].type,
- NULL,
- db),
- strncpy(best_hits_array[best_hit_number].headline,
- read_headline_table_entry(doc_entry.headline_id,db),
- MAX_FILE_NAME_LEN);
- best_hit_number++;
- }
- beFriendly();
- }
- for(i = best_hit_number; i < max_hit_retrieved; i++){
- best_hits_array[best_hit_number].weight = 0;
- }
- /* print_best_hits(s); for debugging */
- }
-
-
- /* returns the next best hit */
- long best_hit(doc_id, score)
- long *doc_id;
- long *score;
- {
- if(current_best_hit > best_hits_array_len)
- return(1);
- if(best_hits_array[current_best_hit].weight == 0)
- return(1);
- *doc_id = best_hits_array[current_best_hit].document_id;
- *score = best_hits_array[current_best_hit].weight;
- current_best_hit++;
- return(0);
- }
-
- long finished_best_hit()
- { /* if we are on a small machine, we might want to
- destroy_document_score_array */
- clear_document_score_array();
- clear_best_hits_array();
- current_best_hit = 0;
- return(0);
- }
-
- /*=============================*
- *=== Searching for words ===*
- *=============================*/
-
- long search_word(word,char_pos, line_pos, weight, doc_id, dictionary_value,
- db)
- char *word; /* the word to be searched for */
- long char_pos; /* the position of the start of the word */
- long line_pos; /* is this needed? not for signature system */
- long weight; /* how important the word looks syntactically,
- such as is it bold */
- long doc_id; /* current document, seed words is 0,
- then it increments into the relevant
- document */
- long dictionary_value; /* this is from the disk dictionary,
- a signature system would use weight,
- inverted file systems would put
- position information */
- database *db;
- {
- /* this side effects the document_score_array,
- * and downcases the word.
- * Returns 0 if successful or word not present,
- * returns non-0 if an error.
- *
- */
-
- long not_full_flag = INDEX_BLOCK_FULL_FLAG; /*start out full so it will go on looking */
- long count, index_block_size;
- long internal_document_id, internal_weight, number_of_valid_entries;
- long index_file_block_number = dictionary_value;
-
- FILE *stream = NULL;
- current_best_hit = 0; /* so that the best hits willstart from 0 */
-
- /* check the document_score_array */
- if(document_score_array_len < db->doc_table_allocated_entries)
- make_document_score_array(db->doc_table_allocated_entries);
-
- if(index_file_block_number >= 0){
- stream = db->index_stream;
-
- while((not_full_flag != INDEX_BLOCK_NOT_FULL_FLAG) &&
- (index_file_block_number != 0)){
- /* read the index block */
- if (0 != fseek(stream, (long)index_file_block_number,
- SEEK_SET))
- {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "fseek failed into the inverted file to position %ld",
- (long)index_file_block_number);
- return(-1);
- }
-
- not_full_flag = read_bytes(INDEX_BLOCK_FLAG_SIZE, stream);
- index_file_block_number = read_bytes(NEXT_INDEX_BLOCK_SIZE, stream);
- index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE, stream);
- if(EOF == index_block_size)
- {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "reading from the index file failed");
- return(-1);
- }
-
- if(not_full_flag == INDEX_BLOCK_NOT_FULL_FLAG){
- /* not full */
- number_of_valid_entries = index_file_block_number;
- }
- else if(not_full_flag == INDEX_BLOCK_FULL_FLAG){
- /* full */
- number_of_valid_entries = index_block_size - INDEX_BLOCK_HEADER_SIZE;
- }
- else{ /* bad news, file is corrupted. */
- waislog(WLOG_HIGH, WLOG_ERROR,
- "Expected the flag in the inverted file to be valid. it is %ld",
- not_full_flag);
- return(-1);
- }
- /* printf("number of valid bytes: %ld\n", number_of_valid_entries); */
-
- /* add the array to the document_score_array */
- for(count = 0; count < number_of_valid_entries;
- count = count + INDEX_ELEMENT_SIZE){
- internal_document_id = read_bytes(DOCUMENT_ID_SIZE, stream);
- (void)read_bytes(WORD_POSITION_SIZE, stream);
- (void)read_bytes(CHARACTER_POSITION_SIZE, stream);
- internal_weight = read_bytes(WEIGHT_SIZE,stream);
- /* printf("entry %ld, Doc_id: %ld, weight %ld \n",
- count, internal_document_id, internal_weight); */
- if(EOF == internal_weight)
- {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "reading from the doc-id table failed");
- return(-1);
- }
- if(doc_id > 0) /* we are doing a relevant document */
- internal_weight /= 0.1;
-
- document_score_array[internal_document_id] =
- document_score_array[internal_document_id] + internal_weight;
- }
- }
- return(0);
- }
- else if(0 == index_file_block_number){
- /* an error occurred on looking up the word */
- return(-1);
- }
- else /* index_file_block_number is negative */
- return(0); /* word not present */
- }
-
- /* now collect the best hits */
- long finished_search_word(db)
- database *db;
- {
- /* check the document_score_array */
- if(document_score_array_len < db->doc_table_allocated_entries)
- make_document_score_array(db->doc_table_allocated_entries);
-
- make_best_hits_array(max_hit_retrieved);
- sort_best_hits(db);
- return(0);
- }
-
-
-
-
-
-
-
-